Source code for nlp_architect.models.memn2n_dialogue

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import absolute_import
from __future__ import division

import tensorflow as tf
from six.moves import range

def zero_nil_slot(t):
    """Return a copy of ``t`` whose first row (the nil slot) is all zeros.

    The nil slot is a dummy slot: it should not be trained and should not
    influence the training algorithm, so its row is overwritten with zeros.

    Args:
        t: A 2-D tensor, or any value ``tf.convert_to_tensor`` accepts that
            yields one.

    Returns:
        A tensor with the same shape as ``t``: a row of zeros followed by
        rows ``1:`` of ``t`` unchanged.
    """
    t = tf.convert_to_tensor(t, name="t")
    num_cols = tf.shape(t)[1]
    # Build the zero row with the input's dtype; tf.zeros defaults to float32
    # and tf.concat requires all of its inputs to share one dtype.
    zero_row = tf.zeros(tf.stack([1, num_cols]), dtype=t.dtype.base_dtype)
    return tf.concat(axis=0, values=[zero_row, tf.slice(t, [1, 0], [-1, -1])])
class MemN2N_Dialog(object):
    """End-To-End Memory Network for goal-oriented dialog.

    The constructor builds the complete TensorFlow graph (placeholders,
    variables, inference, loss, training and prediction ops) and initializes
    the variables in the session. ``batch_fit`` and ``predict`` then run the
    corresponding ops.
    """

    def __init__(self, batch_size, vocab_size, sentence_size, memory_size, embedding_size,
                 num_cands, max_cand_len, hops=3, max_grad_norm=40.0, nonlin=None,
                 initializer=tf.random_normal_initializer(stddev=0.1),
                 optimizer=None, session=None, name='MemN2N_Dialog'):
        """Creates an End-To-End Memory Network for Goal Oriented Dialog

        Args:
            batch_size: The size of the batch. Stored on the instance; the
                placeholders leave the batch dimension unspecified.

            vocab_size: The size of the vocabulary (should include the nil word).
                The nil word one-hot encoding should be 0.

            sentence_size: The max size of a sentence in the data. All sentences
                should be padded to this length. If padding is required it should
                be done with nil one-hot encoding (0).

            memory_size: The max size of the memory. Since Tensorflow currently
                does not support jagged arrays all memories must be padded to this
                length. If padding is required, the extra memories should be empty
                memories; memories filled with the nil word ([0, 0, 0, ......, 0]).

            embedding_size: The size of the word embedding.

            num_cands: The number of candidate answers scored for every example.

            max_cand_len: The length (in word ids) of each encoded candidate answer.

            hops: The number of hops. A hop consists of reading and addressing
                a memory slot. Defaults to `3`.

            max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`.

            nonlin: Non-linearity. Defaults to `None`.

            initializer: Weight initializer. Defaults to
                `tf.random_normal_initializer(stddev=0.1)`.

            optimizer: Optimizer algorithm used for SGD. When `None` (the default)
                a new `tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1e-8)`
                is created.

            session: Tensorflow Session the model is run with. When `None`
                (the default) a new `tf.Session()` is created.

            name: Name of the End-To-End Memory Network, used as the variable
                scope. Defaults to `MemN2N_Dialog`.
        """
        self._batch_size = batch_size
        self._vocab_size = vocab_size
        self._sentence_size = sentence_size
        self._memory_size = memory_size
        self._embedding_size = embedding_size
        self._max_cand_len = max_cand_len
        self._num_cands = num_cands
        self._hops = hops
        self._max_grad_norm = max_grad_norm
        self._nonlin = nonlin
        self._init = initializer
        self._name = name

        self._build_inputs()
        self._build_vars()

        # Create the default optimizer per instance rather than once at
        # function-definition time, so instances do not share one object.
        if optimizer is None:
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1e-8)
        self._opt = optimizer

        # cross entropy
        logits = self._inference(self._stories, self._queries)  # (batch_size, num_cands)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(self._answers, tf.float32), name="cross_entropy")

        # loss op
        cross_entropy_sum = tf.reduce_sum(cross_entropy, name="cross_entropy_sum")

        # gradient pipeline: clip every gradient, then zero the nil-word row
        # of the lookup-table gradients so that row is never updated.
        nil_grads_and_vars = []
        for grad, var in self._opt.compute_gradients(cross_entropy_sum):
            if grad is None:
                # Variable is not connected to the loss; nothing to clip or apply.
                continue
            grad = tf.clip_by_norm(grad, self._max_grad_norm)
            if var.name in self._nil_vars:
                grad = zero_nil_slot(grad)
            nil_grads_and_vars.append((grad, var))
        train_op = self._opt.apply_gradients(nil_grads_and_vars, name="train_op")

        # predict ops
        predict_op = tf.argmax(logits, 1, name="predict_op")
        predict_proba_op = tf.nn.softmax(logits, name="predict_proba_op")
        predict_log_proba_op = tf.log(predict_proba_op, name="predict_log_proba_op")

        # assign ops
        self.loss_op = cross_entropy_sum
        self.predict_op = predict_op
        self.predict_proba_op = predict_proba_op
        self.predict_log_proba_op = predict_log_proba_op
        self.train_op = train_op

        init_op = tf.global_variables_initializer()
        # Open the default session only now (not at import time), after the
        # graph has been built.
        self._sess = session if session is not None else tf.Session()
        self._sess.run(init_op)

        self.saver = tf.train.Saver(max_to_keep=1)

    def _build_inputs(self):
        """Create the int32 placeholders for stories, queries, answers and candidates."""
        self._stories = tf.placeholder(
            tf.int32, [None, None, self._sentence_size], name="stories")
        self._queries = tf.placeholder(
            tf.int32, [None, self._sentence_size], name="queries")
        self._answers = tf.placeholder(
            tf.int32, [None, self._num_cands], name="answers")
        self._cands = tf.placeholder(
            tf.int32, [None, self._num_cands, self._max_cand_len], name="candidate_answers")

    def _build_vars(self):
        """Create the two embedding lookup tables and the hidden-state projection.

        Row 0 of each lookup table (the nil word) starts as zeros; the names of
        both tables are recorded in ``self._nil_vars`` so the constructor can
        zero the gradient of that row.
        """
        with tf.variable_scope(self._name):
            nil_word_slot = tf.zeros([1, self._embedding_size])
            A = tf.concat(axis=0, values=[
                nil_word_slot,
                self._init([self._vocab_size - 1, self._embedding_size])])
            W = tf.concat(axis=0, values=[
                nil_word_slot,
                self._init([self._vocab_size - 1, self._embedding_size])])

            self.LUT_A = tf.Variable(A, name="LUT_A")
            self.LUT_W = tf.Variable(W, name="LUT_W")

            # Dont use projection for layerwise weight sharing
            self.R_proj = tf.Variable(
                self._init([self._embedding_size, self._embedding_size]), name="R_proj")

        self._nil_vars = {self.LUT_A.name, self.LUT_W.name}

    def _inference(self, stories, queries):
        """Build the forward pass and return the candidate logits.

        Args:
            stories: int32 tensor (None, memory_size, sentence_size) of word ids.
            queries: int32 tensor (None, sentence_size) of word ids.

        Returns:
            Float tensor (None, num_cands): the dot product of the final hidden
            state with each summed candidate embedding. Candidates are read
            from the ``self._cands`` placeholder.
        """
        with tf.variable_scope(self._name):
            # Bag-of-words query embedding is the initial hidden state.
            q_emb = tf.nn.embedding_lookup(self.LUT_A, queries)
            u_0 = tf.reduce_sum(q_emb, 1)
            u = [u_0]

            for _ in range(self._hops):
                m_emb_A = tf.nn.embedding_lookup(self.LUT_A, stories)
                m_A = tf.reduce_sum(m_emb_A, 2)

                # hack to get around no reduce_dot
                u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
                dotted = tf.reduce_sum(m_A * u_temp, 2)

                # Calculate probabilities
                probs = tf.nn.softmax(dotted)
                probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])

                # Reuse A for the output memory encoding
                c_temp = tf.transpose(m_A, [0, 2, 1])
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)

                # Project hidden state, and add update
                u_k = tf.matmul(u[-1], self.R_proj) + o_k

                # nonlinearity
                if self._nonlin:
                    u_k = self._nonlin(u_k)

                u.append(u_k)

            cands_emb = tf.nn.embedding_lookup(self.LUT_W, self._cands)
            cands_emb_sum = tf.reduce_sum(cands_emb, 2)

            # u[-1] is the last hop's state, or the query embedding when hops == 0.
            logits = tf.reshape(
                tf.matmul(tf.expand_dims(u[-1], 1),
                          tf.transpose(cands_emb_sum, [0, 2, 1])),
                (-1, cands_emb_sum.shape[1]))

            return logits

    def batch_fit(self, stories, queries, answers, cands):
        """Runs the training algorithm over the passed batch

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)
            answers: Tensor (None, num_cands)
            cands: Tensor (None, num_cands, max_cand_len)

        Returns:
            loss: floating-point number, the loss computed for the batch
        """
        feed_dict = {self._stories: stories, self._queries: queries,
                     self._answers: answers, self._cands: cands}
        loss, _ = self._sess.run([self.loss_op, self.train_op], feed_dict=feed_dict)
        return loss

    def predict(self, stories, queries, cands):
        """Predicts the index of the best-scoring candidate answer.

        Args:
            stories: Tensor (None, memory_size, sentence_size)
            queries: Tensor (None, sentence_size)
            cands: Tensor (None, num_cands, max_cand_len)

        Returns:
            answers: Tensor (None,), the argmax over the candidate logits
        """
        feed_dict = {self._stories: stories, self._queries: queries, self._cands: cands}
        return self._sess.run(self.predict_op, feed_dict=feed_dict)